Team 7 Members: Kunpeng Huang, Yoki Liu, Lyufan Pan, Yunlei Zhou, Jiayuan Zou, Sherry Zuo
## load the packages
library(readr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
library(cluster)
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(tidyverse)
## ── Attaching packages ─────────────────── tidyverse 1.2.1 ──
## ✔ tibble 2.1.3 ✔ stringr 1.4.0
## ✔ tidyr 1.0.0 ✔ forcats 0.4.0
## ── Conflicts ────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(skimr)
##
## Attaching package: 'skimr'
## The following object is masked from 'package:stats':
##
## filter
library(corrplot)
## corrplot 0.84 loaded
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(Rtsne)
library(tidytext)
library(wordcloud)
## Loading required package: RColorBrewer
library(quanteda)
## Package version: 1.5.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
##
## Attaching package: 'tm'
## The following objects are masked from 'package:quanteda':
##
## as.DocumentTermMatrix, stopwords
## load the dataset
data<-read_csv("CC GENERAL.csv")
## Parsed with column specification:
## cols(
## CUST_ID = col_character(),
## BALANCE = col_double(),
## BALANCE_FREQUENCY = col_double(),
## PURCHASES = col_double(),
## ONEOFF_PURCHASES = col_double(),
## INSTALLMENTS_PURCHASES = col_double(),
## CASH_ADVANCE = col_double(),
## PURCHASES_FREQUENCY = col_double(),
## ONEOFF_PURCHASES_FREQUENCY = col_double(),
## PURCHASES_INSTALLMENTS_FREQUENCY = col_double(),
## CASH_ADVANCE_FREQUENCY = col_double(),
## CASH_ADVANCE_TRX = col_double(),
## PURCHASES_TRX = col_double(),
## CREDIT_LIMIT = col_double(),
## PAYMENTS = col_double(),
## MINIMUM_PAYMENTS = col_double(),
## PRC_FULL_PAYMENT = col_double(),
## TENURE = col_double()
## )
colnames(data)<-tolower(colnames(data))
colnames(data)
## [1] "cust_id" "balance"
## [3] "balance_frequency" "purchases"
## [5] "oneoff_purchases" "installments_purchases"
## [7] "cash_advance" "purchases_frequency"
## [9] "oneoff_purchases_frequency" "purchases_installments_frequency"
## [11] "cash_advance_frequency" "cash_advance_trx"
## [13] "purchases_trx" "credit_limit"
## [15] "payments" "minimum_payments"
## [17] "prc_full_payment" "tenure"
##Data Cleaning
cc<-data%>%select(-cust_id)
# Mean-impute a vector: every NA entry is overwritten with the mean of the
# non-missing entries; all other entries are returned unchanged.
colmean <- function(x) {
  missing_idx <- is.na(x)
  x[missing_idx] <- mean(x, na.rm = TRUE)
  x
}
cc <- replace(cc, TRUE, lapply(cc, colmean))
glimpse(cc)
## Observations: 8,950
## Variables: 17
## $ balance <dbl> 40.90075, 3202.46742, 2495.1488…
## $ balance_frequency <dbl> 0.818182, 0.909091, 1.000000, 0…
## $ purchases <dbl> 95.40, 0.00, 773.17, 1499.00, 1…
## $ oneoff_purchases <dbl> 0.00, 0.00, 773.17, 1499.00, 16…
## $ installments_purchases <dbl> 95.40, 0.00, 0.00, 0.00, 0.00, …
## $ cash_advance <dbl> 0.0000, 6442.9455, 0.0000, 205.…
## $ purchases_frequency <dbl> 0.166667, 0.000000, 1.000000, 0…
## $ oneoff_purchases_frequency <dbl> 0.000000, 0.000000, 1.000000, 0…
## $ purchases_installments_frequency <dbl> 0.083333, 0.000000, 0.000000, 0…
## $ cash_advance_frequency <dbl> 0.000000, 0.250000, 0.000000, 0…
## $ cash_advance_trx <dbl> 0, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ purchases_trx <dbl> 2, 0, 12, 1, 1, 8, 64, 12, 5, 3…
## $ credit_limit <dbl> 1000, 7000, 7500, 7500, 1200, 1…
## $ payments <dbl> 201.8021, 4103.0326, 622.0667, …
## $ minimum_payments <dbl> 139.50979, 1072.34022, 627.2847…
## $ prc_full_payment <dbl> 0.000000, 0.222222, 0.000000, 0…
## $ tenure <dbl> 12, 12, 12, 12, 12, 12, 12, 12,…
skim(cc)
## Skim summary statistics
## n obs: 8950
## n variables: 17
##
## ── Variable type:numeric ───────────────────────────────────
## variable missing complete n mean sd
## balance 0 8950 8950 1564.47 2081.53
## balance_frequency 0 8950 8950 0.88 0.24
## cash_advance 0 8950 8950 978.87 2097.16
## cash_advance_frequency 0 8950 8950 0.14 0.2
## cash_advance_trx 0 8950 8950 3.25 6.82
## credit_limit 0 8950 8950 4494.45 3638.61
## installments_purchases 0 8950 8950 411.07 904.34
## minimum_payments 0 8950 8950 864.21 2330.59
## oneoff_purchases 0 8950 8950 592.44 1659.89
## oneoff_purchases_frequency 0 8950 8950 0.2 0.3
## payments 0 8950 8950 1733.14 2895.06
## prc_full_payment 0 8950 8950 0.15 0.29
## purchases 0 8950 8950 1003.2 2136.63
## purchases_frequency 0 8950 8950 0.49 0.4
## purchases_installments_frequency 0 8950 8950 0.36 0.4
## purchases_trx 0 8950 8950 14.71 24.86
## tenure 0 8950 8950 11.52 1.34
## p0 p25 p50 p75 p100 hist
## 0 128.28 873.39 2054.14 19043.14 ▇▂▁▁▁▁▁▁
## 0 0.89 1 1 1 ▁▁▁▁▁▁▁▇
## 0 0 0 1113.82 47137.21 ▇▁▁▁▁▁▁▁
## 0 0 0 0.22 1.5 ▇▂▁▁▁▁▁▁
## 0 0 0 4 123 ▇▁▁▁▁▁▁▁
## 50 1600 3000 6500 30000 ▇▅▂▁▁▁▁▁
## 0 0 89 468.64 22500 ▇▁▁▁▁▁▁▁
## 0.019 170.86 335.63 864.21 76406.21 ▇▁▁▁▁▁▁▁
## 0 0 38 577.41 40761.25 ▇▁▁▁▁▁▁▁
## 0 0 0.083 0.3 1 ▇▂▁▁▁▁▁▁
## 0 383.28 856.9 1901.13 50721.48 ▇▁▁▁▁▁▁▁
## 0 0 0 0.14 1 ▇▁▁▁▁▁▁▁
## 0 39.63 361.28 1110.13 49039.57 ▇▁▁▁▁▁▁▁
## 0 0.083 0.5 0.92 1 ▇▂▁▂▁▂▁▇
## 0 0 0.17 0.75 1 ▇▁▁▂▁▁▁▃
## 0 1 7 17 358 ▇▁▁▁▁▁▁▁
## 6 12 12 12 12 ▁▁▁▁▁▁▁▇
##There are 17 numeric variables and 8950 observations. Replace NAs with colmeans.
summary(lm(credit_limit~. , cc))
##
## Call:
## lm(formula = credit_limit ~ ., data = cc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10230.7 -1664.2 -722.9 1117.5 23266.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.096e+03 2.717e+02 4.033 5.54e-05
## balance 1.016e+00 1.949e-02 52.109 < 2e-16
## balance_frequency -1.869e+03 1.358e+02 -13.759 < 2e-16
## purchases -3.980e-01 2.918e+00 -0.136 0.89151
## oneoff_purchases 4.945e-01 2.918e+00 0.169 0.86544
## installments_purchases 8.477e-01 2.916e+00 0.291 0.77132
## cash_advance 2.013e-01 2.285e-02 8.808 < 2e-16
## purchases_frequency -6.804e+02 2.325e+02 -2.927 0.00343
## oneoff_purchases_frequency 2.976e+03 1.868e+02 15.931 < 2e-16
## purchases_installments_frequency 6.959e+02 2.202e+02 3.160 0.00158
## cash_advance_frequency -1.374e+03 2.656e+02 -5.172 2.37e-07
## cash_advance_trx -2.205e+01 7.454e+00 -2.958 0.00310
## purchases_trx -1.276e+01 2.024e+00 -6.304 3.04e-10
## payments 1.340e-01 1.596e-02 8.392 < 2e-16
## minimum_payments -1.474e-01 1.348e-02 -10.933 < 2e-16
## prc_full_payment 1.876e+03 1.106e+02 16.967 < 2e-16
## tenure 2.205e+02 2.235e+01 9.866 < 2e-16
##
## (Intercept) ***
## balance ***
## balance_frequency ***
## purchases
## oneoff_purchases
## installments_purchases
## cash_advance ***
## purchases_frequency **
## oneoff_purchases_frequency ***
## purchases_installments_frequency **
## cash_advance_frequency ***
## cash_advance_trx **
## purchases_trx ***
## payments ***
## minimum_payments ***
## prc_full_payment ***
## tenure ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2688 on 8933 degrees of freedom
## Multiple R-squared: 0.4554, Adjusted R-squared: 0.4545
## F-statistic: 466.9 on 16 and 8933 DF, p-value: < 2.2e-16
# Pairwise correlations between all columns of cc, drawn as a colour matrix:
# upper triangle only, diagonal hidden, variables reordered by hierarchical
# clustering.
cc_c <- cor(cc)
corrplot(
  cc_c,
  method = "color",
  type = "upper",
  order = "hclust",
  diag = FALSE
)
Based on the correlation plot colors, we think there are more than 4 clusters in our dataset.
##Explore data
ggplot(cc,aes(x=purchases_frequency)) +
geom_histogram()+
theme(panel.background = element_rect(fill="white"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
This histogram shows that purchasing behaviour is polarized. The x-axis indicates how frequently purchases are made (1 = very frequently, 0 = not very frequently). Most observations are concentrated at the two extremes: either the not-very-frequent or the very-frequent group.
ggplot(cc, aes(x = balance_frequency, y = balance, color = factor(tenure)))+
geom_point(alpha = 0.2)+
theme(panel.background = element_rect(fill="white"))
##cash_advance_frequency with cash_advance_trx
##purchases with purchases_installments_frequency
##purchases_frequency with purchases_installments_frequency
ggplot(cc, aes(x = purchases_installments_frequency, y = purchases, color = purchases_frequency))+
geom_point()+
theme(panel.background = element_rect(fill="white"))
# Univariate look at the main spending variables: for each one, a histogram
# stacked above a horizontal boxplot.
eda_vars <- c(
  "balance",                     ##balance
  "balance_frequency",           ##balance freq: both withdraw and saving
  "purchases",                   ##purchases
  "purchases_frequency",         ##purchase freq
  "oneoff_purchases",            ##oneoff purchase: maximum purchase amount
  "oneoff_purchases_frequency",  ##oneoff purchase frequency: maximum purchase refresh freq
  "installments_purchases"       ##installments purchase: maximum installment amount
)
for (v in eda_vars) {
  # Label the histogram the way hist() does for an argument written cc$<name>.
  col_label <- paste0("cc$", v)
  # Two stacked panels per variable: histogram on top, boxplot below.
  par(mfrow = c(2, 1))
  hist(x = cc[[v]], main = paste("Histogram of", col_label), xlab = col_label)
  boxplot(cc[[v]], horizontal = TRUE)
}
## Hclust
# Standardise every column of cc before computing distances.
cc_z <- scale(cc)
# Pairwise Manhattan distances between observations.
cc_dm <- dist(cc_z, method = "manhattan")
# Agglomerative clustering with complete linkage (hclust's default method,
# spelled out here for clarity).
clust <- hclust(cc_dm, method = "complete")
# Cluster sizes when the tree is cut into 7 groups.
table(cutree(clust, k = 7))
##
## 1 2 3 4 5 6 7
## 8258 653 11 18 1 3 6
# Cluster sizes for every cut from 7 to 13 groups. The tables have different
# lengths, so the result is a list with one table per value of k.
lapply(7:13, function(k) table(cutree(clust, k = k)))
## [[1]]
##
## 1 2 3 4 5 6 7
## 8258 653 11 18 1 3 6
##
## [[2]]
##
## 1 2 3 4 5 6 7 8
## 8258 610 43 11 18 1 3 6
##
## [[3]]
##
## 1 2 3 4 5 6 7 8 9
## 8184 610 74 43 11 18 1 3 6
##
## [[4]]
##
## 1 2 3 4 5 6 7 8 9 10
## 8184 610 74 43 11 16 2 1 3 6
##
## [[5]]
##
## 1 2 3 4 5 6 7 8 9 10 11
## 8184 610 74 29 11 14 16 2 1 3 6
##
## [[6]]
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 8184 54 556 74 29 11 14 16 2 1 3 6
##
## [[7]]
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 8184 54 556 74 29 10 14 16 2 1 3 6 1
##Fit the PCA model
# PCA on centred, unit-variance columns. prcomp() names its scaling argument
# `scale.` (with a trailing dot), so it is written out in full.
cc_pca <- prcomp(cc, center = TRUE, scale. = TRUE)
# Variable plot, with each variable coloured by its contribution.
fviz_pca_var(
  cc_pca,
  col.var = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
)
##we pick purchase, purchases_trx, balance, cash_advance, cash_advance_frequency, cash advance_trx, purchase_frequency, oneoff_purchases, installments_purchases, purchases_installments_frequency, credit_limit
fviz_nbclust(scale(cc), kmeans, method = "silhouette", k.max=15)
## choose cluster is 13(7 is also high)
fviz_nbclust(scale(cc), kmeans, method = "wss", k.max=15)
## choose cluster is 2, 4, 7, 9
##Choose Dimensions
get_eigenvalue(cc_pca)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 4.6393063270 2.729004e+01 27.29004
## Dim.2 3.4513385266 2.030199e+01 47.59203
## Dim.3 1.4946365060 8.791979e+00 56.38401
## Dim.4 1.2715957527 7.479975e+00 63.86398
## Dim.5 1.0646746466 6.262792e+00 70.12678
## Dim.6 0.9775359195 5.750211e+00 75.87699
## Dim.7 0.8308298888 4.887235e+00 80.76422
## Dim.8 0.7237725420 4.257486e+00 85.02171
## Dim.9 0.6508967563 3.828804e+00 88.85051
## Dim.10 0.5236010589 3.080006e+00 91.93052
## Dim.11 0.4032225561 2.371897e+00 94.30241
## Dim.12 0.3015175024 1.773632e+00 96.07605
## Dim.13 0.2425036148 1.426492e+00 97.50254
## Dim.14 0.2069527563 1.217369e+00 98.71991
## Dim.15 0.1722082797 1.012990e+00 99.73290
## Dim.16 0.0453957174 2.670336e-01 99.99993
## Dim.17 0.0000116488 6.852235e-05 100.00000
##Based on the eigenvalues, the eigenvalue > 1 rule would keep 5 dimensions; however, since we don't want the cumulative variance to be too small (5 dimensions explain only about 70%), we relax the cutoff to eigenvalue > 0.7 and keep 8 dimensions, which explain about 85% of the cumulative variance.
# Baseline k-means on all standardised columns of cc for k = 7, 9 and 13.
# Each fit allows up to 25 iterations and uses 25 random starts.
set.seed(820)
k1 <- kmeans(scale(cc), centers = 7, iter.max = 25, nstart = 25)
fviz_cluster(k1, data = scale(cc))
table(k1$cluster)
##
## 1 2 3 4 5 6 7
## 2049 629 1186 2846 894 81 1265
k2 <- kmeans(scale(cc), centers = 9, iter.max = 25, nstart = 25)
fviz_cluster(k2, data = scale(cc))
table(k2$cluster)
##
## 1 2 3 4 5 6 7 8 9
## 624 1997 884 1179 1119 356 24 2730 37
k3 <- kmeans(scale(cc), centers = 13, iter.max = 25, nstart = 25)
fviz_cluster(k3, data = scale(cc))
table(k3$cluster)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 36 465 117 117 22 2240 681 727 1523 1065 816 567 574
In the original model, we think k=9 is the best since each cluster has similar size.
## apply the features -- could use new data, or the original as I do below
c_pcs=predict(cc_pca, newdata=cc)
class(c_pcs)
## [1] "matrix"
c_pcs=as.data.frame(c_pcs)
head(c_pcs)
## PC1 PC2 PC3 PC4 PC5 PC6
## 1 -1.6821263 -1.07639047 0.4884792 0.6655146 0.01822351 0.05062610
## 2 -1.1382313 2.50633663 0.6011787 -0.1204305 0.60576903 -1.13677715
## 3 0.9696298 -0.38349887 0.1023657 1.2091985 -2.17246293 -0.21721033
## 4 -0.8735789 0.04316327 1.4600851 1.1519157 0.29561500 -0.12368210
## 5 -1.5993445 -0.68854210 0.3650732 0.9901763 -0.48701187 0.07505538
## 6 0.2512798 -0.78002309 -1.1324232 0.8372199 0.81832369 0.37869126
## PC7 PC8 PC9 PC10 PC11 PC12
## 1 -0.8290981 0.03930102 -0.1153331 -0.077769908 -0.2351681 -0.05388341
## 2 0.3744861 -0.13240330 -0.6878391 -0.777627545 -0.8713885 -0.60182161
## 3 1.5731698 0.16953891 0.8836781 -0.001938548 -0.7616824 0.68416615
## 4 0.2807436 0.55906737 0.1465558 0.393122152 0.7448159 0.14979517
## 5 -0.7078831 -0.20838724 -0.5845864 -0.121727349 -0.4550714 -0.10623672
## 6 -0.7950723 -0.03940339 -0.4634738 -0.210759945 -0.2012053 0.34111438
## PC13 PC14 PC15 PC16 PC17
## 1 -0.08047446 0.18721959 -0.1512568 -0.04845792 -1.611153e-04
## 2 -0.03786824 0.73621429 0.5319925 0.08288199 3.000363e-06
## 3 0.69592147 -0.04230871 -0.2225563 -0.51141786 2.186024e-05
## 4 -0.15588411 0.42383656 0.1053046 0.01910464 -3.921742e-06
## 5 -0.03878338 0.04079744 -0.2903654 0.07064962 -8.132662e-05
## 6 0.49057113 0.07481376 -0.1697661 -0.13065150 -3.462926e-04
c_pc=c_pcs[, 1:8]
head(c_pc)
## PC1 PC2 PC3 PC4 PC5 PC6
## 1 -1.6821263 -1.07639047 0.4884792 0.6655146 0.01822351 0.05062610
## 2 -1.1382313 2.50633663 0.6011787 -0.1204305 0.60576903 -1.13677715
## 3 0.9696298 -0.38349887 0.1023657 1.2091985 -2.17246293 -0.21721033
## 4 -0.8735789 0.04316327 1.4600851 1.1519157 0.29561500 -0.12368210
## 5 -1.5993445 -0.68854210 0.3650732 0.9901763 -0.48701187 0.07505538
## 6 0.2512798 -0.78002309 -1.1324232 0.8372199 0.81832369 0.37869126
## PC7 PC8
## 1 -0.8290981 0.03930102
## 2 0.3744861 -0.13240330
## 3 1.5731698 0.16953891
## 4 0.2807436 0.55906737
## 5 -0.7078831 -0.20838724
## 6 -0.7950723 -0.03940339
fviz_nbclust(scale(c_pc), kmeans, method = "silhouette", k.max=15)
## choose cluster is 2
fviz_nbclust(scale(c_pc), kmeans, method = "wss", k.max=15)
## choose cluster is 2, 5, 7, 9
# k-means on the standardised principal-component scores in c_pc for
# k = 2, 5, 7 and 9. Each fit allows up to 25 iterations and uses 25 random
# starts.
set.seed(820)
k4 <- kmeans(scale(c_pc), centers = 2, iter.max = 25, nstart = 25)
fviz_cluster(k4, data = scale(c_pc))
table(k4$cluster)
##
## 1 2
## 817 8133
k5 <- kmeans(scale(c_pc), centers = 5, iter.max = 25, nstart = 25)
fviz_cluster(k5, data = scale(c_pc))
table(k5$cluster)
##
## 1 2 3 4 5
## 921 1224 727 3878 2200
k6 <- kmeans(scale(c_pc), centers = 7, iter.max = 25, nstart = 25)
fviz_cluster(k6, data = scale(c_pc))
table(k6$cluster)
##
## 1 2 3 4 5 6 7
## 71 931 62 726 3793 2192 1175
k7 <- kmeans(scale(c_pc), centers = 9, iter.max = 25, nstart = 25)
fviz_cluster(k7, data = scale(c_pc))
table(k7$cluster)
##
## 1 2 3 4 5 6 7 8 9
## 1037 1028 70 700 569 1756 2717 1017 56
Since we don’t want any cluster to be too small or too large, and we prefer clusters of roughly even size, we think the best k is 5 for the PCA model.
##Results business related
##Add clustering back to the original dataset
c_pc$cluster<-k5$cluster
plot_ly(x=c_pc[,1], y=c_pc[,2], z=c_pc[,3], type="scatter3d", mode="markers",color =factor(c_pc$cluster))
cc$cluster<-k5$cluster
##Try different variables plotting with different cluster
# Boxplot of one feature, split by cluster.
#   df       data frame holding the feature and a `cluster` column
#   var      name of the feature column, given as a string
#   y_range  optional c(lower, upper) window for the y axis
# Returns the ggplot object (auto-printed when called at top level).
#
# The window is applied with coord_cartesian(), which only zooms the view.
# ylim() would instead discard every row outside the window before the
# boxplot statistics are computed (ggplot2 reports this as "Removed N rows
# containing non-finite values"), which shifts the medians and quartiles
# that the cluster profiles are read from.
plot_cluster_box <- function(df, var, y_range = NULL) {
  p <- ggplot(df, aes(x = cluster, y = .data[[var]], fill = factor(cluster))) +
    geom_boxplot() +
    labs(y = var) +
    theme(panel.background = element_rect(fill = "white"))
  if (!is.null(y_range)) {
    p <- p + coord_cartesian(ylim = y_range)
  }
  p
}
plot_cluster_box(cc, "purchases", c(0, 4000))
plot_cluster_box(cc, "purchases_frequency", c(0, 1))
plot_cluster_box(cc, "credit_limit", c(0, 10000))
plot_cluster_box(cc, "balance", c(0, 7000))
plot_cluster_box(cc, "balance_frequency", c(0.7, 1))
plot_cluster_box(cc, "oneoff_purchases", c(0, 5000))
plot_cluster_box(cc, "purchases_installments_frequency")
plot_cluster_box(cc, "purchases_trx", c(1, 100))
plot_cluster_box(cc, "cash_advance", c(0, 2000))
plot_cluster_box(cc, "cash_advance_frequency", c(0, 0.5))
plot_cluster_box(cc, "cash_advance_trx", c(0, 40))
plot_cluster_box(cc, "installments_purchases", c(0, 5000))
plot_cluster_box(cc, "prc_full_payment")
For cluster 1: low purchases, high balance, low oneoff_purchases, low prc_full_payment
For cluster 2: high purchases, high purchases_frequency, high credit limit, high oneoff_purchases, high purchases_trx, low cash_advance, low cash_advance_trx, high prc_full_payment
For cluster 3: low purchases, low purchase_frequency, low purchases_installments_frequency, low purchases_trx, low installments_purchases, low prc_full_payment, low oneoff_purchases_frequency
For cluster 4: low credit_limit, low balance_frequency, low installments_purchases, low purchases_trx, high cash_advance_frequency
For cluster 5: high purchases_frequency, high purchases_installments_frequency, low cash_advance_trx, high installments_purchases, high purchases_trx, low cash_advances
Summarizing these observations into a profile for each cluster:
cluster1: users for daily purchases
cluster2: premium users with high spending behaviors
cluster3: inactive users
cluster4: new cardholders
cluster5: installment users
##tSNE
# Drop the k-means labels so that only the original features are embedded.
cc2 <- cc %>% select(-cluster)
# Seed explicitly so the embedding is reproducible when this section is run
# on its own rather than as part of a full top-to-bottom run.
set.seed(820)
# Standardise the columns first, as is done for hclust, PCA and k-means in
# this analysis. The features range from 0-1 frequencies to amounts in the
# tens of thousands, and Rtsne() does not rescale individual columns by
# default (pca_scale = FALSE), so unscaled input lets the large-magnitude
# amount columns dominate the distances.
cc_tsne <- Rtsne(
  scale(cc2),
  verbose = TRUE,
  max_iter = 500,
  check_duplicates = FALSE
)
## Performing PCA
## Read the 8950 x 17 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 2.70 seconds (sparsity = 0.013948)!
## Learning embedding...
## Iteration 50: error is 96.446933 (50 iterations in 2.59 seconds)
## Iteration 100: error is 84.272072 (50 iterations in 2.95 seconds)
## Iteration 150: error is 80.927107 (50 iterations in 2.49 seconds)
## Iteration 200: error is 80.573611 (50 iterations in 2.39 seconds)
## Iteration 250: error is 80.497217 (50 iterations in 2.43 seconds)
## Iteration 300: error is 2.825278 (50 iterations in 2.20 seconds)
## Iteration 350: error is 2.333653 (50 iterations in 2.29 seconds)
## Iteration 400: error is 2.059627 (50 iterations in 2.63 seconds)
## Iteration 450: error is 1.882188 (50 iterations in 2.19 seconds)
## Iteration 500: error is 1.758401 (50 iterations in 2.49 seconds)
## Fitting performed in 24.66 seconds.
## remember that this is for plotting, so we can get the 2d space from Y
class(cc_tsne)
## [1] "list"
names(cc_tsne)
## [1] "N" "Y" "costs"
## [4] "itercosts" "origD" "perplexity"
## [7] "theta" "max_iter" "stop_lying_iter"
## [10] "mom_switch_iter" "momentum" "final_momentum"
## [13] "eta" "exaggeration_factor"
## lets create the plot
tsne_proj=cc_tsne$Y
class(tsne_proj)
## [1] "matrix"
dim(tsne_proj)
## [1] 8950 2
head(tsne_proj)
## [,1] [,2]
## [1,] -21.6492006 15.479782
## [2,] 9.0345892 -7.445791
## [3,] -0.8264224 -12.061120
## [4,] -7.0738783 -10.785271
## [5,] 1.7832611 23.622041
## [6,] 4.9834984 10.068784
nrow(cc) ==nrow(tsne_proj)
## [1] TRUE
tsne_df=as.data.frame(tsne_proj)
plot(tsne_df$V1, tsne_df$V2, type="p", pch=19)
## just a big hairball, but lets clean it up and map onto it
cc_final=cbind(cc, tsne_df)
fviz_nbclust(scale(tsne_proj), kmeans, method = "silhouette", k.max=15)
## choose cluster is 6
fviz_nbclust(scale(tsne_proj), kmeans, method = "wss", k.max=15)
## choose cluster is 3, 4, 7, 8
# k-means on the standardised 2-D t-SNE coordinates for k = 3, 4, 6, 7, 8
# and 5. Each fit allows up to 25 iterations and uses 25 random starts.
z2 <- scale(as.data.frame(tsne_proj))
k8 <- kmeans(z2, centers = 3, iter.max = 25, nstart = 25)
fviz_cluster(k8, data = z2)
table(k8$cluster)
##
## 1 2 3
## 2953 2941 3056
k9 <- kmeans(z2, centers = 4, iter.max = 25, nstart = 25)
fviz_cluster(k9, data = z2)
table(k9$cluster)
##
## 1 2 3 4
## 2189 2409 2153 2199
k10 <- kmeans(z2, centers = 6, iter.max = 25, nstart = 25)
fviz_cluster(k10, data = z2)
table(k10$cluster)
##
## 1 2 3 4 5 6
## 1338 1486 1485 1491 1588 1562
k11 <- kmeans(z2, centers = 7, iter.max = 25, nstart = 25)
fviz_cluster(k11, data = z2)
table(k11$cluster)
##
## 1 2 3 4 5 6 7
## 1273 1372 1133 1139 1400 1339 1294
k12 <- kmeans(z2, centers = 8, iter.max = 25, nstart = 25)
fviz_cluster(k12, data = z2)
table(k12$cluster)
##
## 1 2 3 4 5 6 7 8
## 934 1364 1084 1175 1073 1229 1025 1066
k13 <- kmeans(z2, centers = 5, iter.max = 25, nstart = 25)
fviz_cluster(k13, data = z2)
# Attach the 5-cluster t-SNE labels to the feature table for profiling.
cc2$cluster <- k13$cluster
##Try different variables plotting with different cluster
# Feature -> y-axis window for the per-cluster boxplots of the t-SNE
# clustering stored in cc2$cluster. NULL means "show the full range".
tsne_box_windows <- list(
  purchases = c(0, 4000),
  purchases_frequency = c(0, 1),
  credit_limit = c(0, 10000),
  balance = c(0, 6000),
  balance_frequency = c(0.5, 1),
  oneoff_purchases = c(0, 2000),
  purchases_installments_frequency = NULL,
  purchases_trx = c(1, 100),
  cash_advance = c(0, 2000),
  cash_advance_frequency = c(0, 0.5),
  cash_advance_trx = c(0, 30),
  installments_purchases = c(0, 4000),
  prc_full_payment = NULL
)
for (var in names(tsne_box_windows)) {
  p <- ggplot(cc2, aes(x = cluster, y = .data[[var]], fill = factor(cluster))) +
    geom_boxplot() +
    labs(y = var) +
    theme(panel.background = element_rect(fill = "white"))
  y_window <- tsne_box_windows[[var]]
  if (!is.null(y_window)) {
    # Zoom with coord_cartesian() rather than ylim(): ylim() drops every row
    # outside the window before the boxplot statistics are computed (the
    # "Removed N rows containing non-finite values" warning), which distorts
    # the medians and quartiles being compared across clusters.
    p <- p + coord_cartesian(ylim = y_window)
  }
  # Plots are not auto-printed inside a loop, so print explicitly.
  print(p)
}
For cluster 1: low purchases/freq, normal credit 2000, normal balance, high freq 1700, low one-off, low installment/freq, low full pay.
For cluster 2: high purchases/freq, high credit limit, low balance, high oneoff_purchases, low cash_advance, low cash_advance_trx, high installment, high full payment.
For cluster 3: normal purchases (~400), low credit limit, low balance, low one-off payment, low cash, normal full payment percentage.
For cluster 4: normal purchases (200-1000), normal freq, mid credit_limit, low balance, low one-off purchase, normal installments_purchases, low cash freq, low cash.
For cluster 5: normal purchases (~0-800), high credit limit (~2000-5000), high balance/freq, low one-off, low installments_purchases/freq, high cash_advance/freq, low full payment.
With the tSNE method, the cluster features are not clear.
##try text mining
# Hand-written text corpus for the word-frequency analysis that follows.
# `mains` holds 30 strings: the 18 data-dictionary entries for the dataset's
# columns, followed by 12 project notes (overview, title, dataset summary,
# modelling notes and comments). `colnum` is a running index 1..30; it must
# have the same length as `mains` for the later as.data.frame() conversion.
information<-list(
colnum = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30),
mains = c(
# Data dictionary: one "NAME: description" entry per column.
"CUST_ID: Identification of Credit Card holder",
"BALANCE: Balance amount left in their account to make purchases",
"BALANCE_FREQUENCY: How frequency the Balance is updated, a score between 0 and 1",
"PURCHASES: Amount of purchases made from the account",
"ONEOFF_PURCHASES: Maximum purchases amount did in one-go",
"INSTALLMENTS_PURCHASES: Amount of purchases done in installments",
"CASH_ADVANCE: Cash in advance given by the users",
"PURCHASES_FREQUENCY: How frequency the Purchases are being made, score
between 0 and 1",
"ONEOFF_PURCHASES_FREQUENCY: How frequency Purchases are happening in
one-go",
"PURCHASES_INSTALLMENTS_FREQUENCY: How frequency purchases in
installments are being done",
"CASH_ADVANCE_FREQUENCY: How frequency the cash in advance being paid",
"CASH_ADVANCE_TRX: Number of Transactions made with Cash in Advance",
"PURCHASES_TRX: Number of purchases transactions made",
"CREDIT_LIMIT: Limit of Credit Card for users",
"PAYMENTS: Amount of Payments done by the users",
"MINIMUM_PAYMENTS: Minimum amount of payments made by the users",
"PRC_FULL_PAYMENT: Percent of full payment paid by the users",
"TENURE: Tenure of credit card service for users",
# Project notes: free text describing the goal, data and modelling decisions.
"overview: The problem that motivates us is the segmentation of customers that could be very necessary and powerful to define marketing strategies. The credit card usage behavior of customers with 17 behavioral features seems to be a perfect dataset for us to explore the customers segmentation on their purchasing behaviors with credit cards, consider score",
"title: Credit Card Customers Segmentation",
"dataset: The sample Dataset summarizes the usage behavior of 8950 active credit card holders during the last 6 months. The file is at a customers level with 18 behavioral variables.",
"Focus on variables balance, balance_frequency, purchases, purchases_frequency, oneoff_purchases, oneoff_purchases_frequency, installments_purchases.",
"Baseline clustering:
Silhouette score: clustering is 13(7 and 9 is also very high)
WSS- clustering is 2, 4, 7, 9
Compare those cluster plots, we think k=9 is the best since each cluster has a similar size in the baseline clustering segmentaion. ",
"Dimension Reduction (PCA model)
Compare Eigenvalue and Cumulative Variance for dimension",
"eigenvalue, we want to eigenvalue>1 for dimension, Dimension with 5; however, since we don’t want cumulative variance too small, eigenvalue>0.7 for dimension, so we choose Dimension with 8 which also has 85% of cumulative variance.",
"Clustering for PCA model: Silhouette score- clustering is 2
WSS- clustering is 2, 5, 7, 9
Since we don't want the size of the clustering too small or larger and try to average the size of clustering, so the best k is 5 for the PCA model.
Our best model overall:Thus, our best model is Dimension 8 and clustering 5. ",
"Add clustering back to the original dataset, compare variables for clustering based on different boxplots, the characteristics of each clustering. see whether oneoff obvious
1: users for daily purchases
2: premium card users with high spending behaviors
3: inactive card users
4: new card users
5: installment card users",
"Based on those 5 features, we design different market strategies for various target customers.",
"For cluster 1, 2, 3, 4, 5, consider credit score",
"comments:marketing strategies based on the profiles of the customers is your goal, create a marketing strategy around segmentation, credit score guess"
)
)
# Normalise the text: lower-case it, turn underscores into spaces (so that
# column names split into separate words) and strip digits.
information$mains <- str_to_lower(information$mains)
information <- as.data.frame(information)
information$mains <- as.character(information$mains)
information$mains <- gsub("_", " ", information$mains)
information$mains <- removeNumbers(information$mains)
# Combine the three stop-word lexicons into one lookup table.
# anti_join() accepts a single `y` table; passing several tables positionally
# binds the extras to `copy`/`...`, so they are never used for filtering.
# get_stopwords() with no arguments already returns the snowball list, so it
# is requested once.
all_stopwords <- bind_rows(
  get_stopwords(source = "snowball"),
  get_stopwords(source = "stopwords-iso"),
  get_stopwords(source = "smart")
) %>%
  distinct(word)
# One row per word token, with every stop word from any lexicon removed.
credit_tokens <- information %>%
  unnest_tokens(token, mains, strip_punct = TRUE) %>%
  anti_join(all_stopwords, by = c("token" = "word"))
##address these common words that are not adding value
# Token frequencies, most common first.
tidy_tokens <- count(credit_tokens, token, sort = TRUE)
# Bar chart of the 25 most frequent tokens.
ggplot(head(tidy_tokens, n = 25), aes(x = token, y = n)) +
  geom_col(fill = "#E7B800") +
  theme(
    panel.background = element_rect(fill = "white"),
    axis.text.x = element_text(angle = 30)
  )
##get useful top words
# Return to a single-panel layout before drawing the word cloud.
par(mfrow = c(1, 1))
wordcloud(
  words = tidy_tokens$token,
  freq = tidy_tokens$n,
  min.freq = 4,
  max.words = 45,
  colors = brewer.pal(8, "Dark2")
)